1 Predict the weather using RandomForest

	1.1 Creating and running a machine learning pipeline
		
		spark-submit --class ca.training.bigdata.spark.ml.WeatherPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

2 Sentence Similarity using Word2Vec

	2.1 Download GoogleNews-vectors-negative300.bin.gz (run in /root on your sandbox)
		
		wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
			
	2.2 Upload GoogleNews-vectors-negative300.bin.gz into /root on your sandbox

	2.3 Log in to localhost:4200 as root

		gunzip GoogleNews-vectors-negative300.bin.gz		
			
	2.4 Convert GoogleNews-vectors-negative300.bin into GoogleNews-vectors-negative300.tsv

		2.4.1 git clone https://github.com/marekrei/convertvec.git			

		2.4.2 cd /root/convertvec

		2.4.3 Make the following change in the makefile
		
			replace -Ofast with -O2

		2.4.4 run the following command

			make 
			
		2.4.5 convert the file	
		
			cd /root
			/root/convertvec/convertvec bin2txt GoogleNews-vectors-negative300.bin GoogleNews-vectors-negative300.tsv
			
	2.5 Run the following command in the command-line terminal:

		hadoop fs -put /root/TrainingOnHDP/dataset/spark/sentence_pairs.txt /tmp
		hadoop fs -put /root/TrainingOnHDP/dataset/spark/stopwords.txt /tmp

		
	2.6 Run the following command:

		spark-submit --class ca.training.bigdata.spark.ml.SentenceSimilarity --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
			

3 Predict the power plant using XGBoost

	spark-submit --class ca.training.bigdata.spark.ml.PowerPlantPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
			

4 Predict the power plant using XGBoost and Spark ML Pipeline

	spark-submit --class ca.training.bigdata.spark.ml.PowerPlantPredictionPipeline --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

	
5 Machine learning application for textual analysis

	5.1 Data Preparation
	
		5.1.1 Download the following files to /root 
		
			wget https://www.sec.gov/Archives/edgar/data/320193/000119312514383437/0001193125-14-383437.txt
			wget https://raw.githubusercontent.com/iammrhelo/edgar-10k-sa/master/LoughranMcDonald_MasterDictionary_2014.csv
			wget https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
			gunzip reuters21578.tar.gz
			tar -xvf reuters21578.tar
			wget https://archive.ics.uci.edu/ml/machine-learning-databases/00239/corpus.zip
			unzip -x corpus.zip
			wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
			gunzip reviews_Electronics_5.json.gz
			wget https://archive.ics.uci.edu/ml/machine-learning-databases/00233/CNAE-9.data
			
			
	5.2 Textual analysis using Spark ML Pipeline
		
		spark-submit --class ca.training.bigdata.spark.ml.TextualAnalysis --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
		spark-submit --class ca.training.bigdata.spark.ml.TextualAnalysisPipeline --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
		

	5.3 Topic Modeling using LDA

		spark-submit --class ca.training.bigdata.spark.ml.TopicModelingLDA --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
	
	5.4 Predict Amazon Review using Naive Bayes

		spark-submit --class ca.training.bigdata.spark.ml.AmazonReviewPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
	5.5 Logistic Regression Example
	
		spark-submit --class ca.training.bigdata.spark.ml.LogisticRegressionExample --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	

6 Handwritten digit recognition using Decision Tree


	spark-submit --class ca.training.bigdata.spark.ml.HandwrittenDigitRecognition --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

	
7 NLP using KMeans

	7.1 Prepare Data
		
		wget https://archive.ics.uci.edu/ml/machine-learning-databases/20newsgroups-mld/20_newsgroups.tar.gz
		gunzip 20_newsgroups.tar.gz
		tar -xvf 20_newsgroups.tar
		hadoop fs -mkdir /tmp/20_newsgroups
		hadoop fs -put /root/20_newsgroups /tmp/20_newsgroups
		
	7.2 Run the following command
	
		spark-submit --class ca.training.bigdata.spark.ml.NLPKMeans --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

		
8 Movie Recommendation using ALS
	
	spark-submit --class ca.training.bigdata.spark.ml.MovieRecommendation --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
	
9 Churning Prediction using Decision Tree

	spark-submit --class ca.training.bigdata.spark.ml.ChurningPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

	
10 Bike Sharing Prediction using Generalized Linear Regression

	spark-submit --class ca.training.bigdata.spark.ml.BikeSharingPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar


11 Credit Risk Prediction using Random Forest

	spark-submit --class ca.training.bigdata.spark.ml.CreditRiskPrediction --driver-memory 2G --executor-memory 2G --master local[1] /root/TrainingOnHDP/MachineLearningOnSpark/target/MachineLearningOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
	
